import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import pandas as pd
import seaborn as sb
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn import metrics
A. Import ‘BankChurners.csv’ as DataFrame. [1 Marks]
bc=pd.read_csv('D:\\Great Learning\\FMT\\Project\\BC.csv')
bc.head()
| CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | ... | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 768805383 | Existing Customer | 45 | M | 3 | High School | Married | $60K - $80K | Blue | 39 | ... | 1 | 3 | 12691.0 | 777 | 11914.0 | 1.335 | 1144 | 42 | 1.625 | 0.061 |
| 1 | 818770008 | Existing Customer | 49 | F | 5 | Graduate | Single | Less than $40K | Blue | 44 | ... | 1 | 2 | 8256.0 | 864 | 7392.0 | 1.541 | 1291 | 33 | 3.714 | 0.105 |
| 2 | 713982108 | Existing Customer | 51 | M | 3 | Graduate | Married | $80K - $120K | Blue | 36 | ... | 1 | 0 | 3418.0 | 0 | 3418.0 | 2.594 | 1887 | 20 | 2.333 | 0.000 |
| 3 | 769911858 | Existing Customer | 40 | F | 4 | High School | NaN | Less than $40K | Blue | 34 | ... | 4 | 1 | 3313.0 | 2517 | 796.0 | 1.405 | 1171 | 20 | 2.333 | 0.760 |
| 4 | 709106358 | Existing Customer | 40 | M | 3 | Uneducated | Married | $60K - $80K | Blue | 21 | ... | 1 | 0 | 4716.0 | 0 | 4716.0 | 2.175 | 816 | 28 | 2.500 | 0.000 |
5 rows × 21 columns
B. Check the info of the dataset and also check if any duplicate records in the data. [2 Marks]
bc.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CLIENTNUM 10127 non-null int64 1 Attrition_Flag 10127 non-null object 2 Customer_Age 10127 non-null int64 3 Gender 10127 non-null object 4 Dependent_count 10127 non-null int64 5 Education_Level 8608 non-null object 6 Marital_Status 9378 non-null object 7 Income_Category 10127 non-null object 8 Card_Category 10127 non-null object 9 Months_on_book 10127 non-null int64 10 Total_Relationship_Count 10127 non-null int64 11 Months_Inactive_12_mon 10127 non-null int64 12 Contacts_Count_12_mon 10127 non-null int64 13 Credit_Limit 10127 non-null float64 14 Total_Revolving_Bal 10127 non-null int64 15 Avg_Open_To_Buy 10127 non-null float64 16 Total_Amt_Chng_Q4_Q1 10127 non-null float64 17 Total_Trans_Amt 10127 non-null int64 18 Total_Trans_Ct 10127 non-null int64 19 Total_Ct_Chng_Q4_Q1 10127 non-null float64 20 Avg_Utilization_Ratio 10127 non-null float64 dtypes: float64(5), int64(10), object(6) memory usage: 1.6+ MB
dup = bc[bc.duplicated()]
dup
| CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | ... | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio |
|---|
0 rows × 21 columns
bc.drop_duplicates()
| CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | ... | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 768805383 | Existing Customer | 45 | M | 3 | High School | Married | $60K - $80K | Blue | 39 | ... | 1 | 3 | 12691.0 | 777 | 11914.0 | 1.335 | 1144 | 42 | 1.625 | 0.061 |
| 1 | 818770008 | Existing Customer | 49 | F | 5 | Graduate | Single | Less than $40K | Blue | 44 | ... | 1 | 2 | 8256.0 | 864 | 7392.0 | 1.541 | 1291 | 33 | 3.714 | 0.105 |
| 2 | 713982108 | Existing Customer | 51 | M | 3 | Graduate | Married | $80K - $120K | Blue | 36 | ... | 1 | 0 | 3418.0 | 0 | 3418.0 | 2.594 | 1887 | 20 | 2.333 | 0.000 |
| 3 | 769911858 | Existing Customer | 40 | F | 4 | High School | NaN | Less than $40K | Blue | 34 | ... | 4 | 1 | 3313.0 | 2517 | 796.0 | 1.405 | 1171 | 20 | 2.333 | 0.760 |
| 4 | 709106358 | Existing Customer | 40 | M | 3 | Uneducated | Married | $60K - $80K | Blue | 21 | ... | 1 | 0 | 4716.0 | 0 | 4716.0 | 2.175 | 816 | 28 | 2.500 | 0.000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10122 | 772366833 | Existing Customer | 50 | M | 2 | Graduate | Single | $40K - $60K | Blue | 40 | ... | 2 | 3 | 4003.0 | 1851 | 2152.0 | 0.703 | 15476 | 117 | 0.857 | 0.462 |
| 10123 | 710638233 | Attrited Customer | 41 | M | 2 | NaN | Divorced | $40K - $60K | Blue | 25 | ... | 2 | 3 | 4277.0 | 2186 | 2091.0 | 0.804 | 8764 | 69 | 0.683 | 0.511 |
| 10124 | 716506083 | Attrited Customer | 44 | F | 1 | High School | Married | Less than $40K | Blue | 36 | ... | 3 | 4 | 5409.0 | 0 | 5409.0 | 0.819 | 10291 | 60 | 0.818 | 0.000 |
| 10125 | 717406983 | Attrited Customer | 30 | M | 2 | Graduate | NaN | $40K - $60K | Blue | 36 | ... | 3 | 3 | 5281.0 | 0 | 5281.0 | 0.535 | 8395 | 62 | 0.722 | 0.000 |
| 10126 | 714337233 | Attrited Customer | 43 | F | 2 | Graduate | Married | Less than $40K | Silver | 25 | ... | 2 | 4 | 10388.0 | 1961 | 8427.0 | 0.703 | 10294 | 61 | 0.649 | 0.189 |
10127 rows × 21 columns
There are no duplicate records in the dataframe
C. Print the 5-point summary of the data and share your insights on data distribution. [2 Marks]
bc.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| CLIENTNUM | 10127.0 | 7.391776e+08 | 3.690378e+07 | 708082083.0 | 7.130368e+08 | 7.179264e+08 | 7.731435e+08 | 8.283431e+08 |
| Customer_Age | 10127.0 | 4.632596e+01 | 8.016814e+00 | 26.0 | 4.100000e+01 | 4.600000e+01 | 5.200000e+01 | 7.300000e+01 |
| Dependent_count | 10127.0 | 2.346203e+00 | 1.298908e+00 | 0.0 | 1.000000e+00 | 2.000000e+00 | 3.000000e+00 | 5.000000e+00 |
| Months_on_book | 10127.0 | 3.592841e+01 | 7.986416e+00 | 13.0 | 3.100000e+01 | 3.600000e+01 | 4.000000e+01 | 5.600000e+01 |
| Total_Relationship_Count | 10127.0 | 3.812580e+00 | 1.554408e+00 | 1.0 | 3.000000e+00 | 4.000000e+00 | 5.000000e+00 | 6.000000e+00 |
| Months_Inactive_12_mon | 10127.0 | 2.341167e+00 | 1.010622e+00 | 0.0 | 2.000000e+00 | 2.000000e+00 | 3.000000e+00 | 6.000000e+00 |
| Contacts_Count_12_mon | 10127.0 | 2.455317e+00 | 1.106225e+00 | 0.0 | 2.000000e+00 | 2.000000e+00 | 3.000000e+00 | 6.000000e+00 |
| Credit_Limit | 10127.0 | 8.631954e+03 | 9.088777e+03 | 1438.3 | 2.555000e+03 | 4.549000e+03 | 1.106750e+04 | 3.451600e+04 |
| Total_Revolving_Bal | 10127.0 | 1.162814e+03 | 8.149873e+02 | 0.0 | 3.590000e+02 | 1.276000e+03 | 1.784000e+03 | 2.517000e+03 |
| Avg_Open_To_Buy | 10127.0 | 7.469140e+03 | 9.090685e+03 | 3.0 | 1.324500e+03 | 3.474000e+03 | 9.859000e+03 | 3.451600e+04 |
| Total_Amt_Chng_Q4_Q1 | 10127.0 | 7.599407e-01 | 2.192068e-01 | 0.0 | 6.310000e-01 | 7.360000e-01 | 8.590000e-01 | 3.397000e+00 |
| Total_Trans_Amt | 10127.0 | 4.404086e+03 | 3.397129e+03 | 510.0 | 2.155500e+03 | 3.899000e+03 | 4.741000e+03 | 1.848400e+04 |
| Total_Trans_Ct | 10127.0 | 6.485869e+01 | 2.347257e+01 | 10.0 | 4.500000e+01 | 6.700000e+01 | 8.100000e+01 | 1.390000e+02 |
| Total_Ct_Chng_Q4_Q1 | 10127.0 | 7.122224e-01 | 2.380861e-01 | 0.0 | 5.820000e-01 | 7.020000e-01 | 8.180000e-01 | 3.714000e+00 |
| Avg_Utilization_Ratio | 10127.0 | 2.748936e-01 | 2.756915e-01 | 0.0 | 2.300000e-02 | 1.760000e-01 | 5.030000e-01 | 9.990000e-01 |
The customer age is between 26 to 73. All ages of people are eligible for credit card
The minimum credit limit is around 1400 rs and maximum is 34000 rs.
The average total transaction amount is approximately 4400rs
The for average utilization ratio, the median is closer to 75% (Q3) than 25% (Q1). So the data are left-skewed.
D. Print the unique values of all the categorical columns in the dataset. Share your observations. [2 Marks]
def preprocess():
bc1= bc.select_dtypes(include=['object'])
print(bc1.columns.unique())
for col in bc1.columns:
# to print categories name only
print(bc1[col].value_counts())
preprocess()
Index(['Attrition_Flag', 'Gender', 'Education_Level', 'Marital_Status',
'Income_Category', 'Card_Category'],
dtype='object')
Existing Customer 8500
Attrited Customer 1627
Name: Attrition_Flag, dtype: int64
F 5358
M 4769
Name: Gender, dtype: int64
Graduate 3128
High School 2013
Uneducated 1487
College 1013
Post-Graduate 516
Doctorate 451
Name: Education_Level, dtype: int64
Married 4687
Single 3943
Divorced 748
Name: Marital_Status, dtype: int64
Less than $40K 3561
$40K - $60K 1790
$80K - $120K 1535
$60K - $80K 1402
abc 1112
$120K + 727
Name: Income_Category, dtype: int64
Blue 9436
Silver 555
Gold 116
Platinum 20
Name: Card_Category, dtype: int64
The female customers hold credit cards more compared to male customers
The Postgraduates and doctrates refrain from using credit cards compared to graduates who use it the most
Married people use credit cards more compared to single people
People with salary less than 40K use credit cards more compared to people who earn more than 40K
The blue card with lowest credit limit is issued to more customers based on their salary limit and platinum card is issued to elite customers
E.Check for the % of missing values in the dataset. Do not treat the missing values here. [1 Mark]
def miss():
print(bc.isnull().sum())
miss()
CLIENTNUM 0 Attrition_Flag 0 Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 1519 Marital_Status 749 Income_Category 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64
def fillnan():
bc["Education_Level"].fillna("Nan", inplace = True)
bc["Marital_Status"].fillna("Nan", inplace = True)
print(bc.Education_Level.value_counts())
print(bc.Marital_Status.value_counts())
fillnan()
Graduate 3128 High School 2013 Nan 1519 Uneducated 1487 College 1013 Post-Graduate 516 Doctorate 451 Name: Education_Level, dtype: int64 Married 4687 Single 3943 Nan 749 Divorced 748 Name: Marital_Status, dtype: int64
A. Drop redundant columns from the data. [1 Mark]
bc1 = bc.loc[:,~bc.apply(lambda x: x.duplicated(),axis=1).all()].copy()
bc1
| CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | ... | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 768805383 | Existing Customer | 45 | M | 3 | High School | Married | $60K - $80K | Blue | 39 | ... | 1 | 3 | 12691.0 | 777 | 11914.0 | 1.335 | 1144 | 42 | 1.625 | 0.061 |
| 1 | 818770008 | Existing Customer | 49 | F | 5 | Graduate | Single | Less than $40K | Blue | 44 | ... | 1 | 2 | 8256.0 | 864 | 7392.0 | 1.541 | 1291 | 33 | 3.714 | 0.105 |
| 2 | 713982108 | Existing Customer | 51 | M | 3 | Graduate | Married | $80K - $120K | Blue | 36 | ... | 1 | 0 | 3418.0 | 0 | 3418.0 | 2.594 | 1887 | 20 | 2.333 | 0.000 |
| 3 | 769911858 | Existing Customer | 40 | F | 4 | High School | Nan | Less than $40K | Blue | 34 | ... | 4 | 1 | 3313.0 | 2517 | 796.0 | 1.405 | 1171 | 20 | 2.333 | 0.760 |
| 4 | 709106358 | Existing Customer | 40 | M | 3 | Uneducated | Married | $60K - $80K | Blue | 21 | ... | 1 | 0 | 4716.0 | 0 | 4716.0 | 2.175 | 816 | 28 | 2.500 | 0.000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10122 | 772366833 | Existing Customer | 50 | M | 2 | Graduate | Single | $40K - $60K | Blue | 40 | ... | 2 | 3 | 4003.0 | 1851 | 2152.0 | 0.703 | 15476 | 117 | 0.857 | 0.462 |
| 10123 | 710638233 | Attrited Customer | 41 | M | 2 | Nan | Divorced | $40K - $60K | Blue | 25 | ... | 2 | 3 | 4277.0 | 2186 | 2091.0 | 0.804 | 8764 | 69 | 0.683 | 0.511 |
| 10124 | 716506083 | Attrited Customer | 44 | F | 1 | High School | Married | Less than $40K | Blue | 36 | ... | 3 | 4 | 5409.0 | 0 | 5409.0 | 0.819 | 10291 | 60 | 0.818 | 0.000 |
| 10125 | 717406983 | Attrited Customer | 30 | M | 2 | Graduate | Nan | $40K - $60K | Blue | 36 | ... | 3 | 3 | 5281.0 | 0 | 5281.0 | 0.535 | 8395 | 62 | 0.722 | 0.000 |
| 10126 | 714337233 | Attrited Customer | 43 | F | 2 | Graduate | Married | Less than $40K | Silver | 25 | ... | 2 | 4 | 10388.0 | 1961 | 8427.0 | 0.703 | 10294 | 61 | 0.649 | 0.189 |
10127 rows × 21 columns
No redundant columns in the dataframe
B. Encode the Existing and Attrited customers to 0 and 1 respectively in the ‘Attrition_Flag’ variable. [2 Marks]
Note: Please use inplace=True while replacing values to make permanent changes
def encodeatt():
bc['Attrition_Flag'].replace({'Existing Customer':0, 'Attrited Customer':1}, inplace=True)
bc['Attrition_Flag'].info()
encodeatt()
<class 'pandas.core.series.Series'> RangeIndex: 10127 entries, 0 to 10126 Series name: Attrition_Flag Non-Null Count Dtype -------------- ----- 10127 non-null int64 dtypes: int64(1) memory usage: 79.2 KB
bc.head()
| CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | ... | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 768805383 | 0 | 45 | M | 3 | High School | Married | $60K - $80K | Blue | 39 | ... | 1 | 3 | 12691.0 | 777 | 11914.0 | 1.335 | 1144 | 42 | 1.625 | 0.061 |
| 1 | 818770008 | 0 | 49 | F | 5 | Graduate | Single | Less than $40K | Blue | 44 | ... | 1 | 2 | 8256.0 | 864 | 7392.0 | 1.541 | 1291 | 33 | 3.714 | 0.105 |
| 2 | 713982108 | 0 | 51 | M | 3 | Graduate | Married | $80K - $120K | Blue | 36 | ... | 1 | 0 | 3418.0 | 0 | 3418.0 | 2.594 | 1887 | 20 | 2.333 | 0.000 |
| 3 | 769911858 | 0 | 40 | F | 4 | High School | Nan | Less than $40K | Blue | 34 | ... | 4 | 1 | 3313.0 | 2517 | 796.0 | 1.405 | 1171 | 20 | 2.333 | 0.760 |
| 4 | 709106358 | 0 | 40 | M | 3 | Uneducated | Married | $60K - $80K | Blue | 21 | ... | 1 | 0 | 4716.0 | 0 | 4716.0 | 2.175 | 816 | 28 | 2.500 | 0.000 |
5 rows × 21 columns
A. Perform Univariate and Multivariate analysis on the data. Share your insights. [ 3 Marks]
Univariate Analysis
def univariate():
plt.figure(1)
bc.Attrition_Flag.value_counts(normalize=True).plot(kind = 'bar', title = "Attrition",figsize=(4, 3))
plt.figure(2)
bc.Income_Category.value_counts(normalize=True).plot(kind = 'bar', title = "Income_Category",color='red')
plt.figure(3)
bc.Education_Level.value_counts(normalize=True).plot(kind = 'bar', title = "Education_Level",color='green')
univariate()
Multivariate Analysis
def multivariate():
cols = bc.describe().columns[1:]
dtype_mapping = dict(bc.dtypes)
numeric_cols = [ c for c in cols if dtype_mapping[c] !='O' ]
print(numeric_cols)
bc_num=bc.filter(['Attrition_Flag','Customer_Age', 'Dependent_count','Months_on_book', 'Total_Relationship_Count','Months_Inactive_12_mon','Contacts_Count_12_mon','Credit_Limit','Total_Revolving_Bal','Avg_Open_To_Buy','Total_Amt_Chng_Q4_Q1','Total_Trans_Amt','Total_Trans_Ct','Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio'],axis=1)
import seaborn as sb
cols_to_plot=bc_num.columns[2:21].tolist() + ['Attrition_Flag']
sb.pairplot(bc[cols_to_plot],hue="Attrition_Flag", palette='hls')
plt.show()
corr1=bc_num.corr()
sb.heatmap(corr1, cmap=None, annot=True,annot_kws={"size":5})
multivariate()
['Attrition_Flag', 'Customer_Age', 'Dependent_count', 'Months_on_book', 'Total_Relationship_Count', 'Months_Inactive_12_mon', 'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal', 'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt', 'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio']
Dependant count,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon has least correlation with almost all coloumns so they can be dropped
Months_on_book and Customer Age are highly correlated.
Attrition is highly correlated with Months_Inactive_12_mon and Contacts_Count_12_mon
Total_Revolving_Bal and Avg_Utilization_Ratio are highly correlated
A. Find the percentage of outliers, in each column of the data. [2 Marks]
Note: Do not treat outliers here as there will be outliers in real case scenario (in age, the total amount of transactions, number of
transactions, etc) and we would want our model to learn the underlying pattern for such customers.
def outliers():
plt.figure(1)
sb.boxplot(x="Attrition_Flag",y="Avg_Utilization_Ratio",data=bc)
plt.figure(2)
sb.boxplot(x="Attrition_Flag",y="Total_Trans_Amt",data=bc)
plt.figure(3)
sb.boxplot(x="Attrition_Flag",y="Total_Trans_Ct",data=bc)
plt.figure(4)
sb.boxplot(x="Attrition_Flag",y="Customer_Age",data=bc)
outliers1 = bc[bc['Customer_Age'] > bc['Customer_Age'].mean() + 3 * bc['Customer_Age'].std()]
print(outliers1)
outliers2 = bc[bc['Total_Trans_Ct'] > bc['Total_Trans_Ct'].mean() + 3 * bc['Total_Trans_Ct'].std()]
print(outliers2)
outliers3 = bc[bc['Total_Trans_Amt'] > bc['Total_Trans_Amt'].mean() + 3 * bc['Total_Trans_Amt'].std()]
print(outliers3)
outliers()
CLIENTNUM Attrition_Flag Customer_Age Gender Dependent_count \
251 715952883 0 73 M 0
Education_Level Marital_Status Income_Category Card_Category \
251 High School Married $40K - $60K Blue
Months_on_book ... Months_Inactive_12_mon Contacts_Count_12_mon \
251 36 ... 3 2
Credit_Limit Total_Revolving_Bal Avg_Open_To_Buy Total_Amt_Chng_Q4_Q1 \
251 4469.0 1125 3344.0 1.363
Total_Trans_Amt Total_Trans_Ct Total_Ct_Chng_Q4_Q1 \
251 1765 34 1.615
Avg_Utilization_Ratio
251 0.252
[1 rows x 21 columns]
CLIENTNUM Attrition_Flag Customer_Age Gender Dependent_count \
9324 708163758 0 41 M 3
9586 784868958 0 56 F 1
Education_Level Marital_Status Income_Category Card_Category \
9324 Nan Married $120K + Blue
9586 High School Married abc Blue
Months_on_book ... Months_Inactive_12_mon Contacts_Count_12_mon \
9324 33 ... 4 3
9586 49 ... 2 1
Credit_Limit Total_Revolving_Bal Avg_Open_To_Buy \
9324 34516.0 638 33878.0
9586 17542.0 2517 15025.0
Total_Amt_Chng_Q4_Q1 Total_Trans_Amt Total_Trans_Ct \
9324 0.724 13085 139
9586 0.800 13939 138
Total_Ct_Chng_Q4_Q1 Avg_Utilization_Ratio
9324 0.675 0.018
9586 0.792 0.143
[2 rows x 21 columns]
CLIENTNUM Attrition_Flag Customer_Age Gender Dependent_count \
9106 788709408 0 41 F 2
9138 709108533 0 49 M 4
9143 771062733 0 44 F 2
9145 713552958 0 43 M 5
9151 711814608 0 41 F 5
... ... ... ... ... ...
10116 714109308 0 46 M 5
10117 712503408 0 57 M 2
10120 710841183 0 54 M 1
10121 713899383 0 56 F 1
10122 772366833 0 50 M 2
Education_Level Marital_Status Income_Category Card_Category \
9106 Nan Married abc Blue
9138 Graduate Married $80K - $120K Blue
9143 Graduate Single Less than $40K Blue
9145 Graduate Nan $60K - $80K Blue
9151 High School Married $40K - $60K Silver
... ... ... ... ...
10116 College Single $80K - $120K Blue
10117 Graduate Married $80K - $120K Blue
10120 High School Single $60K - $80K Blue
10121 Graduate Single Less than $40K Blue
10122 Graduate Single $40K - $60K Blue
Months_on_book ... Months_Inactive_12_mon Contacts_Count_12_mon \
9106 21 ... 3 1
9138 29 ... 2 3
9143 31 ... 2 1
9145 28 ... 3 1
9151 36 ... 1 1
... ... ... ... ...
10116 36 ... 2 3
10117 40 ... 3 4
10120 34 ... 2 0
10121 50 ... 1 4
10122 40 ... 2 3
Credit_Limit Total_Revolving_Bal Avg_Open_To_Buy \
9106 14746.0 0 14746.0
9138 9959.0 2216 7743.0
9143 4291.0 1285 3006.0
9145 12773.0 1637 11136.0
9151 19281.0 1805 17476.0
... ... ... ...
10116 13187.0 2241 10946.0
10117 17925.0 1909 16016.0
10120 13940.0 2109 11831.0
10121 3688.0 606 3082.0
10122 4003.0 1851 2152.0
Total_Amt_Chng_Q4_Q1 Total_Trans_Amt Total_Trans_Ct \
9106 0.857 14771 127
9138 0.747 15139 90
9143 0.846 15511 101
9145 0.919 14833 114
9151 0.893 15423 110
... ... ... ...
10116 0.689 15354 112
10117 0.712 17498 111
10120 0.660 15577 114
10121 0.570 14596 120
10122 0.703 15476 117
Total_Ct_Chng_Q4_Q1 Avg_Utilization_Ratio
9106 0.628 0.000
9138 0.837 0.223
9143 0.629 0.299
9145 0.583 0.128
9151 0.692 0.094
... ... ...
10116 0.931 0.170
10117 0.820 0.106
10120 0.754 0.151
10121 0.791 0.164
10122 0.857 0.462
[391 rows x 21 columns]
B. Replace 'abc' values with 'np.nan' in Income_Category column. [1 Mark]
def replaceval():
bc2=bc.replace('abc', 'Nan')
print(bc2['Income_Category'].value_counts())
replaceval()
Less than $40K 3561 $40K - $60K 1790 $80K - $120K 1535 $60K - $80K 1402 Nan 1112 $120K + 727 Name: Income_Category, dtype: int64
C. Separate Target and Predictor variables. [2 Marks]
def targetpredict():
global X,y,bc1
bc1=bc.drop(['CLIENTNUM'],axis=1)
X = bc1.iloc[:,1 :]
y = bc1.iloc[:,0]
return(X,y)
targetpredict()
( Customer_Age Gender Dependent_count Education_Level Marital_Status \
0 45 M 3 High School Married
1 49 F 5 Graduate Single
2 51 M 3 Graduate Married
3 40 F 4 High School Nan
4 40 M 3 Uneducated Married
... ... ... ... ... ...
10122 50 M 2 Graduate Single
10123 41 M 2 Nan Divorced
10124 44 F 1 High School Married
10125 30 M 2 Graduate Nan
10126 43 F 2 Graduate Married
Income_Category Card_Category Months_on_book Total_Relationship_Count \
0 $60K - $80K Blue 39 5
1 Less than $40K Blue 44 6
2 $80K - $120K Blue 36 4
3 Less than $40K Blue 34 3
4 $60K - $80K Blue 21 5
... ... ... ... ...
10122 $40K - $60K Blue 40 3
10123 $40K - $60K Blue 25 4
10124 Less than $40K Blue 36 5
10125 $40K - $60K Blue 36 4
10126 Less than $40K Silver 25 6
Months_Inactive_12_mon Contacts_Count_12_mon Credit_Limit \
0 1 3 12691.0
1 1 2 8256.0
2 1 0 3418.0
3 4 1 3313.0
4 1 0 4716.0
... ... ... ...
10122 2 3 4003.0
10123 2 3 4277.0
10124 3 4 5409.0
10125 3 3 5281.0
10126 2 4 10388.0
Total_Revolving_Bal Avg_Open_To_Buy Total_Amt_Chng_Q4_Q1 \
0 777 11914.0 1.335
1 864 7392.0 1.541
2 0 3418.0 2.594
3 2517 796.0 1.405
4 0 4716.0 2.175
... ... ... ...
10122 1851 2152.0 0.703
10123 2186 2091.0 0.804
10124 0 5409.0 0.819
10125 0 5281.0 0.535
10126 1961 8427.0 0.703
Total_Trans_Amt Total_Trans_Ct Total_Ct_Chng_Q4_Q1 \
0 1144 42 1.625
1 1291 33 3.714
2 1887 20 2.333
3 1171 20 2.333
4 816 28 2.500
... ... ... ...
10122 15476 117 0.857
10123 8764 69 0.683
10124 10291 60 0.818
10125 8395 62 0.722
10126 10294 61 0.649
Avg_Utilization_Ratio
0 0.061
1 0.105
2 0.000
3 0.760
4 0.000
... ...
10122 0.462
10123 0.511
10124 0.000
10125 0.000
10126 0.189
[10127 rows x 19 columns],
0 0
1 0
2 0
3 0
4 0
..
10122 0
10123 1
10124 1
10125 1
10126 1
Name: Attrition_Flag, Length: 10127, dtype: int64)
A. Split data into 2 parts, say temporary and test. Please use below line of code. [1 Mark]
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)
B. Now split the temporary set into train and validation. Please use below line of code. [1 Mark]
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=1, stratify=y_temp)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=1, stratify=y_temp)
X_train.Income_Category.value_counts()
Less than $40K 2129 $40K - $60K 1059 $80K - $120K 953 $60K - $80K 831 abc 654 $120K + 449 Name: Income_Category, dtype: int64
X_test.Income_Category.value_counts()
Less than $40K 696 $40K - $60K 370 $60K - $80K 292 $80K - $120K 289 abc 237 $120K + 142 Name: Income_Category, dtype: int64
X_val.Income_Category.value_counts()
Less than $40K 736 $40K - $60K 361 $80K - $120K 293 $60K - $80K 279 abc 221 $120K + 136 Name: Income_Category, dtype: int64
X_train.Education_Level.value_counts()
Graduate 1854 High School 1228 Nan 928 Uneducated 881 College 618 Post-Graduate 312 Doctorate 254 Name: Education_Level, dtype: int64
X_test.Education_Level.value_counts()
Graduate 651 High School 381 Uneducated 300 Nan 297 College 196 Post-Graduate 103 Doctorate 98 Name: Education_Level, dtype: int64
X_val.Education_Level.value_counts()
Graduate 623 High School 404 Uneducated 306 Nan 294 College 199 Post-Graduate 101 Doctorate 99 Name: Education_Level, dtype: int64
X_train.Marital_Status.value_counts()
Married 2819 Single 2369 Nan 457 Divorced 430 Name: Marital_Status, dtype: int64
X_test.Marital_Status.value_counts()
Married 908 Single 804 Divorced 162 Nan 152 Name: Marital_Status, dtype: int64
X_val.Marital_Status.value_counts()
Married 960 Single 770 Divorced 156 Nan 140 Name: Marital_Status, dtype: int64
y_train.value_counts()
0 5099 1 976 Name: Attrition_Flag, dtype: int64
def traintest():
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=1, stratify=y_temp)
A. Impute all missing values in "Education_Level", "Marital_Status", "Income_Category" columns using suitable techniques from
X_train, X_val, X_test separately. [3 Marks]
Note: "Education_Level", "Marital_Status", "Income_Category" are categorical variables.
def Impute():
from sklearn.impute import SimpleImputer
ic_imputer = SimpleImputer(missing_values="Nan", strategy="most_frequent")
X_train.Income_Category = ic_imputer.fit_transform(X_train["Income_Category"].values.reshape(-1,1))[:,0]
X_test.Income_Category = ic_imputer.fit_transform(X_test["Income_Category"].values.reshape(-1,1))[:,0]
X_val.Income_Category = ic_imputer.fit_transform(X_val["Income_Category"].values.reshape(-1,1))[:,0]
X_train.Education_Level = ic_imputer.fit_transform(X_train["Education_Level"].values.reshape(-1,1))[:,0]
X_test.Education_Level = ic_imputer.fit_transform(X_test["Education_Level"].values.reshape(-1,1))[:,0]
X_val.Education_Level = ic_imputer.fit_transform(X_val["Education_Level"].values.reshape(-1,1))[:,0]
X_train.Marital_Status = ic_imputer.fit_transform(X_train["Marital_Status"].values.reshape(-1,1))[:,0]
X_test.Marital_Status = ic_imputer.fit_transform(X_test["Marital_Status"].values.reshape(-1,1))[:,0]
X_val.Marital_Status = ic_imputer.fit_transform(X_val["Marital_Status"].values.reshape(-1,1))[:,0]
return()
from sklearn.impute import SimpleImputer
ic_imputer = SimpleImputer(missing_values="Nan", strategy="most_frequent")
X_test.Income_Category = ic_imputer.fit_transform(X_test["Income_Category"].values.reshape(-1,1))[:,0]
X_test.Income_Category.value_counts()
Less than $40K 696 $40K - $60K 370 $60K - $80K 292 $80K - $120K 289 abc 237 $120K + 142 Name: Income_Category, dtype: int64
from sklearn.impute import SimpleImputer
ic_imputer = SimpleImputer(missing_values="Nan", strategy="most_frequent")
X_val.Income_Category = ic_imputer.fit_transform(X_val["Income_Category"].values.reshape(-1,1))[:,0]
X_val.Income_Category.value_counts()
Less than $40K 736 $40K - $60K 361 $80K - $120K 293 $60K - $80K 279 abc 221 $120K + 136 Name: Income_Category, dtype: int64
from sklearn.impute import SimpleImputer
ic_imputer = SimpleImputer(missing_values="Nan", strategy="most_frequent")
X_train.Education_Level = ic_imputer.fit_transform(X_train["Education_Level"].values.reshape(-1,1))[:,0]
X_train.Education_Level.value_counts()
Graduate 2782 High School 1228 Uneducated 881 College 618 Post-Graduate 312 Doctorate 254 Name: Education_Level, dtype: int64
from sklearn.impute import SimpleImputer
ic_imputer = SimpleImputer(missing_values="Nan", strategy="most_frequent")
X_test.Education_Level = ic_imputer.fit_transform(X_test["Education_Level"].values.reshape(-1,1))[:,0]
X_test.Education_Level.value_counts()
Graduate 948 High School 381 Uneducated 300 College 196 Post-Graduate 103 Doctorate 98 Name: Education_Level, dtype: int64
from sklearn.impute import SimpleImputer
ic_imputer = SimpleImputer(missing_values="Nan", strategy="most_frequent")
X_val.Education_Level = ic_imputer.fit_transform(X_val["Education_Level"].values.reshape(-1,1))[:,0]
X_val.Education_Level.value_counts()
Graduate 917 High School 404 Uneducated 306 College 199 Post-Graduate 101 Doctorate 99 Name: Education_Level, dtype: int64
from sklearn.impute import SimpleImputer
ic_imputer = SimpleImputer(missing_values="Nan", strategy="most_frequent")
X_train.Marital_Status = ic_imputer.fit_transform(X_train["Marital_Status"].values.reshape(-1,1))[:,0]
X_train.Marital_Status.value_counts()
Married 3276 Single 2369 Divorced 430 Name: Marital_Status, dtype: int64
from sklearn.impute import SimpleImputer
ic_imputer = SimpleImputer(missing_values="Nan", strategy="most_frequent")
X_test.Marital_Status = ic_imputer.fit_transform(X_test["Marital_Status"].values.reshape(-1,1))[:,0]
X_test.Marital_Status.value_counts()
Married 1060 Single 804 Divorced 162 Name: Marital_Status, dtype: int64
from sklearn.impute import SimpleImputer
ic_imputer = SimpleImputer(missing_values="Nan", strategy="most_frequent")
X_val.Marital_Status = ic_imputer.fit_transform(X_val["Marital_Status"].values.reshape(-1,1))[:,0]
X_val.Marital_Status.value_counts()
Married 1100 Single 770 Divorced 156 Name: Marital_Status, dtype: int64
A. Encode all Categorical columns in X_train, X_val, X_test using one-hot encoding. [ 3 Marks]
X_train.shape
(6075, 19)
X_test.shape
(2026, 19)
X_val.shape
(2026, 19)
y_train.shape
(6075,)
A. Encode all Categorical columns in X_train, X_val, X_test using one-hot encoding. [ 3 Marks]
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
#entel = pd.DataFrame()
object_col = X_train.columns[pd.Series(X_train.columns).apply(lambda x : X_train[x].dtype == 'O')]
for i in X_train.columns:
if i in object_col:
X_train[i] = le.fit_transform(X_train[i])
else:
X_train[i] = X_train[i]
X_train1=X_train.copy()
X_train1
| Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 800 | 40 | 1 | 2 | 2 | 2 | 0 | 0 | 21 | 6 | 4 | 3 | 20056.0 | 1602 | 18454.0 | 0.466 | 1687 | 46 | 0.533 | 0.080 |
| 498 | 44 | 1 | 1 | 2 | 1 | 5 | 0 | 34 | 6 | 2 | 0 | 2885.0 | 1895 | 990.0 | 0.387 | 1366 | 31 | 0.632 | 0.657 |
| 4356 | 48 | 1 | 4 | 3 | 1 | 3 | 0 | 36 | 5 | 1 | 2 | 6798.0 | 2517 | 4281.0 | 0.873 | 4327 | 79 | 0.881 | 0.370 |
| 407 | 41 | 1 | 2 | 2 | 1 | 2 | 3 | 36 | 6 | 2 | 0 | 27000.0 | 0 | 27000.0 | 0.610 | 1209 | 39 | 0.300 | 0.000 |
| 8728 | 46 | 1 | 4 | 3 | 0 | 1 | 3 | 36 | 2 | 2 | 3 | 15034.0 | 1356 | 13678.0 | 0.754 | 7737 | 84 | 0.750 | 0.090 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 987 | 47 | 0 | 2 | 2 | 2 | 4 | 0 | 36 | 6 | 2 | 2 | 5145.0 | 0 | 5145.0 | 0.693 | 1588 | 35 | 0.944 | 0.000 |
| 5633 | 47 | 1 | 3 | 5 | 2 | 0 | 0 | 36 | 3 | 3 | 4 | 34516.0 | 0 | 34516.0 | 0.517 | 2020 | 33 | 0.375 | 0.000 |
| 9621 | 50 | 0 | 0 | 4 | 0 | 4 | 0 | 36 | 3 | 4 | 1 | 3468.0 | 2221 | 1247.0 | 0.783 | 14222 | 122 | 0.627 | 0.640 |
| 593 | 45 | 0 | 4 | 5 | 1 | 4 | 0 | 38 | 3 | 4 | 3 | 4908.0 | 0 | 4908.0 | 0.993 | 827 | 21 | 0.750 | 0.000 |
| 4035 | 55 | 0 | 2 | 0 | 1 | 5 | 0 | 41 | 6 | 2 | 2 | 7613.0 | 0 | 7613.0 | 0.680 | 4268 | 69 | 0.725 | 0.000 |
6075 rows × 19 columns
object_col = X_test.columns[pd.Series(X_test.columns).apply(lambda x : X_test[x].dtype == 'O')]
for i in X_test.columns:
if i in object_col:
X_test[i] = le.fit_transform(X_test[i])
else:
X_test[i] = X_test[i]
X_test
| Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 9760 | 32 | 1 | 1 | 3 | 2 | 3 | 0 | 26 | 2 | 3 | 2 | 6407.0 | 1130 | 5277.0 | 0.756 | 14471 | 93 | 0.603 | 0.176 |
| 7413 | 50 | 1 | 1 | 4 | 2 | 2 | 0 | 36 | 4 | 3 | 2 | 2317.0 | 0 | 2317.0 | 0.734 | 2214 | 41 | 0.519 | 0.000 |
| 6074 | 54 | 0 | 2 | 3 | 1 | 1 | 0 | 36 | 3 | 3 | 3 | 3892.0 | 0 | 3892.0 | 0.738 | 4318 | 74 | 0.762 | 0.000 |
| 3520 | 61 | 1 | 0 | 5 | 1 | 0 | 0 | 36 | 4 | 3 | 4 | 24172.0 | 2517 | 21655.0 | 0.424 | 1658 | 27 | 0.500 | 0.104 |
| 6103 | 41 | 0 | 3 | 0 | 1 | 1 | 0 | 15 | 5 | 3 | 4 | 4312.0 | 2517 | 1795.0 | 0.741 | 2693 | 56 | 0.436 | 0.584 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1484 | 32 | 1 | 0 | 4 | 1 | 2 | 0 | 26 | 6 | 2 | 3 | 5478.0 | 1491 | 3987.0 | 1.304 | 2456 | 55 | 0.571 | 0.272 |
| 1271 | 32 | 0 | 0 | 3 | 0 | 4 | 0 | 36 | 5 | 2 | 4 | 2699.0 | 1609 | 1090.0 | 0.893 | 2141 | 56 | 0.806 | 0.596 |
| 4002 | 45 | 1 | 2 | 2 | 1 | 0 | 0 | 40 | 4 | 4 | 4 | 16476.0 | 1374 | 15102.0 | 0.459 | 4717 | 71 | 0.690 | 0.083 |
| 8562 | 33 | 0 | 4 | 0 | 1 | 1 | 0 | 26 | 1 | 2 | 3 | 2874.0 | 2517 | 357.0 | 0.730 | 4794 | 88 | 0.600 | 0.876 |
| 413 | 55 | 0 | 2 | 2 | 1 | 4 | 0 | 37 | 3 | 3 | 1 | 2453.0 | 968 | 1485.0 | 1.081 | 2218 | 50 | 0.613 | 0.395 |
2026 rows × 19 columns
object_col = X_val.columns[pd.Series(X_val.columns).apply(lambda x : X_val[x].dtype == 'O')]
for i in X_val.columns:
if i in object_col:
X_val[i] = le.fit_transform(X_val[i])
else:
X_val[i] = X_val[i]
X_val
| Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2894 | 37 | 1 | 0 | 4 | 2 | 3 | 0 | 27 | 5 | 2 | 3 | 15326.0 | 0 | 15326.0 | 1.159 | 2990 | 55 | 0.964 | 0.000 |
| 9158 | 58 | 1 | 2 | 5 | 2 | 3 | 0 | 46 | 1 | 3 | 1 | 10286.0 | 0 | 10286.0 | 0.908 | 8199 | 59 | 0.903 | 0.000 |
| 9618 | 42 | 1 | 3 | 5 | 1 | 0 | 2 | 23 | 3 | 4 | 3 | 34516.0 | 2070 | 32446.0 | 0.880 | 13781 | 102 | 0.545 | 0.060 |
| 9910 | 47 | 1 | 3 | 2 | 1 | 3 | 0 | 36 | 3 | 2 | 3 | 9683.0 | 1116 | 8567.0 | 0.721 | 15627 | 104 | 0.825 | 0.115 |
| 5497 | 60 | 0 | 1 | 3 | 2 | 4 | 0 | 36 | 5 | 2 | 2 | 2688.0 | 1617 | 1071.0 | 0.552 | 4183 | 71 | 0.614 | 0.602 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 426 | 39 | 1 | 2 | 5 | 2 | 2 | 0 | 36 | 5 | 3 | 3 | 13961.0 | 1299 | 12662.0 | 0.842 | 783 | 20 | 0.667 | 0.093 |
| 5091 | 53 | 0 | 0 | 2 | 2 | 4 | 0 | 40 | 4 | 1 | 4 | 1438.3 | 0 | 1438.3 | 0.503 | 2014 | 53 | 0.472 | 0.000 |
| 500 | 45 | 0 | 4 | 2 | 1 | 5 | 0 | 35 | 6 | 3 | 0 | 2801.0 | 2043 | 758.0 | 0.583 | 1241 | 46 | 0.643 | 0.729 |
| 353 | 53 | 0 | 2 | 2 | 2 | 1 | 0 | 42 | 6 | 2 | 0 | 5332.0 | 0 | 5332.0 | 0.939 | 1371 | 39 | 1.167 | 0.000 |
| 6319 | 38 | 0 | 0 | 2 | 2 | 4 | 0 | 32 | 6 | 3 | 3 | 1755.0 | 750 | 1005.0 | 0.659 | 5000 | 79 | 0.837 | 0.427 |
2026 rows × 19 columns
y_train
800 0
498 0
4356 0
407 0
8728 0
..
987 0
5633 1
9621 0
593 1
4035 0
Name: Attrition_Flag, Length: 6075, dtype: int64
A. Build a Decision Tree, Logistic regression, Random Forest, Gradient Boost and XG Boost model on train data. [ 5 Marks]
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import confusion_matrix
dt = DecisionTreeRegressor(random_state=1)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
dt_score = dt.score(X_test, y_test)
dt_recall = round(tp/(tp+fn), 3) # Of all the people with Chd, how many were recognised to have Chd
dt_precision = round(tp/(tp+fp), 3) # Of all the people predicted to have Chd, how many did have Chd
dt_specificity =round(tn/(tn+fp),3)
result = pd.DataFrame({'Model' : ['DecisionTree'], 'Accuracy' : [dt_score], 'Precision' : [dt_precision],
'True positive rate' : [dt_recall], 'True negative rate' : [dt_specificity],
'False positive rate' : [1-dt_specificity]})
result
| Model | Accuracy | Precision | True positive rate | True negative rate | False positive rate | |
|---|---|---|---|---|---|---|
| 0 | DecisionTree | 0.475932 | 0.765 | 0.809 | 0.952 | 0.048 |
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score
lr = LogisticRegression(solver='liblinear', random_state=0)
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test,pred).ravel()
lr_score = lr.score(X_test, y_test)
lr_recall = round(tp/(tp+fn), 3) # Of all the people with Chd, how many were recognised to have Chd
lr_precision = round(tp/(tp+fp), 3) # Of all the people predicted to have Chd, how many did have Chd
lr_specificity =round(tn/(tn+fp),3) # Of all the people without Chds, how many were recognised to not have Chd
result = pd.DataFrame({'Model' : ['Logistic Regression'], 'Accuracy' : [lr_score], 'Precision' : [lr_precision],
'True positive rate' : [lr_recall], 'True negative rate' : [lr_specificity],
'False positive rate' : [1-lr_specificity]})
result
| Model | Accuracy | Precision | True positive rate | True negative rate | False positive rate | |
|---|---|---|---|---|---|---|
| 0 | Logistic Regression | 0.890424 | 0.744 | 0.483 | 0.968 | 0.032 |
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train, y_train)
pred = rf.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
rf_score = rf.score(X_test, y_test)
rf_recall = round(tp/(tp+fn), 3) # Of all the people with Chd, how many were recognised to have Chd
rf_precision = round(tp/(tp+fp), 3) # Of all the people predicted to have Chd, how many did have Chd
rf_specificity =round(tn/(tn+fp),3)
result = pd.DataFrame({'Model' : ['RandomForest'], 'Accuracy' : [rf_score], 'Precision' : [rf_precision],
'True positive rate' : [rf_recall], 'True negative rate' : [rf_specificity],
'False positive rate' : [1-rf_specificity]})
result
| Model | Accuracy | Precision | True positive rate | True negative rate | False positive rate | |
|---|---|---|---|---|---|---|
| 0 | RandomForest | 0.960513 | 0.907 | 0.84 | 0.984 | 0.016 |
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier(n_estimators=500,learning_rate=0.05,random_state=100,max_features=5)
gb.fit(X_train, y_train)
pred = gb.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
gb_score = gb.score(X_test, y_test)
gb_recall = round(tp/(tp+fn), 3) # Of all the people with Chd, how many were recognised to have Chd
gb_precision = round(tp/(tp+fp), 3) # Of all the people predicted to have Chd, how many did have Chd
gb_specificity =round(tn/(tn+fp),3)
result = pd.DataFrame({'Model' : ['GradientBoosting'], 'Accuracy' : [gb_score], 'Precision' : [gb_precision],
'True positive rate' : [gb_recall], 'True negative rate' : [gb_specificity],
'False positive rate' : [1-gb_specificity]})
result
#print("mse Score(gb) : ",mse(y_test, y_pred))
| Model | Accuracy | Precision | True positive rate | True negative rate | False positive rate | |
|---|---|---|---|---|---|---|
| 0 | GradientBoosting | 0.977789 | 0.943 | 0.917 | 0.989 | 0.011 |
from xgboost import XGBClassifier
xg = XGBClassifier(learning_rate=0.001,max_depth = 1,n_estimators = 100,scale_pos_weight=5)
xg.fit(X_train, y_train)
pred = xg.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
xg_score = xg.score(X_test, y_test)
xg_recall = round(tp/(tp+fn), 3) # Of all the people with Chd, how many were recognised to have Chd
xg_precision = round(tp/(tp+fp), 3) # Of all the people predicted to have Chd, how many did have Chd
xg_specificity =round(tn/(tn+fp),3)
result = pd.DataFrame({'Model' : ['XGBClassifier'], 'Accuracy' : [xg_score], 'Precision' : [xg_precision],
'True positive rate' : [xg_recall], 'True negative rate' : [xg_specificity],
'False positive rate' : [1-xg_specificity]})
result
| Model | Accuracy | Precision | True positive rate | True negative rate | False positive rate | |
|---|---|---|---|---|---|---|
| 0 | XGBClassifier | 0.733465 | 0.355 | 0.806 | 0.72 | 0.28 |
B. Validate the models built above on Validation set. [ 4 Marks]
y_pred = dt.predict(X_val)
tn, fp, fn, tp = confusion_matrix(y_val, y_pred).ravel()
dt_score = dt.score(X_val, y_val)
dt_recall = round(tp/(tp+fn), 3) # Of all the people with Chd, how many were recognised to have Chd
dt_precision = round(tp/(tp+fp), 3) # Of all the people predicted to have Chd, how many did have Chd
dt_specificity =round(tn/(tn+fp),3)
result = pd.DataFrame({'Model' : ['DecisionTree'], 'Accuracy' : [dt_score], 'Precision' : [dt_precision],
'True positive rate' : [dt_recall], 'True negative rate' : [dt_specificity],
'False positive rate' : [1-dt_specificity]})
result
| Model | Accuracy | Precision | True positive rate | True negative rate | False positive rate | |
|---|---|---|---|---|---|---|
| 0 | DecisionTree | 0.546691 | 0.804 | 0.819 | 0.962 | 0.038 |
pred = lr.predict(X_val)
tn, fp, fn, tp = confusion_matrix(y_val,pred).ravel()
lr_score = lr.score(X_val, y_val)
lr_recall = round(tp/(tp+fn), 3) # Of all the people with Chd, how many were recognised to have Chd
lr_precision = round(tp/(tp+fp), 3) # Of all the people predicted to have Chd, how many did have Chd
lr_specificity =round(tn/(tn+fp),3) # Of all the people without Chds, how many were recognised to not have Chd
result = pd.DataFrame({'Model' : ['Logistic Regression'], 'Accuracy' : [lr_score], 'Precision' : [lr_precision],
'True positive rate' : [lr_recall], 'True negative rate' : [lr_specificity],
'False positive rate' : [1-lr_specificity]})
result
| Model | Accuracy | Precision | True positive rate | True negative rate | False positive rate | |
|---|---|---|---|---|---|---|
| 0 | Logistic Regression | 0.897828 | 0.764 | 0.528 | 0.969 | 0.031 |
pred = rf.predict(X_val)
tn, fp, fn, tp = confusion_matrix(y_val, pred).ravel()
rf_score = rf.score(X_val, y_val)
rf_recall = round(tp/(tp+fn), 3) # Of all the people with Chd, how many were recognised to have Chd
rf_precision = round(tp/(tp+fp), 3) # Of all the people predicted to have Chd, how many did have Chd
rf_specificity =round(tn/(tn+fp),3)
result = pd.DataFrame({'Model' : ['RandomForest'], 'Accuracy' : [rf_score], 'Precision' : [rf_precision],
'True positive rate' : [rf_recall], 'True negative rate' : [rf_specificity],
'False positive rate' : [1-rf_specificity]})
result
| Model | Accuracy | Precision | True positive rate | True negative rate | False positive rate | |
|---|---|---|---|---|---|---|
| 0 | RandomForest | 0.962488 | 0.931 | 0.828 | 0.988 | 0.012 |
pred = gb.predict(X_val)
tn, fp, fn, tp = confusion_matrix(y_val, pred).ravel()
gb_score = gb.score(X_val, y_val)
gb_recall = round(tp/(tp+fn), 3) # Of all the people with Chd, how many were recognised to have Chd
gb_precision = round(tp/(tp+fp), 3) # Of all the people predicted to have Chd, how many did have Chd
gb_specificity =round(tn/(tn+fp),3)
result = pd.DataFrame({'Model' : ['GradientBoosting'], 'Accuracy' : [gb_score], 'Precision' : [gb_precision],
'True positive rate' : [gb_recall], 'True negative rate' : [gb_specificity],
'False positive rate' : [1-gb_specificity]})
result
| Model | Accuracy | Precision | True positive rate | True negative rate | False positive rate | |
|---|---|---|---|---|---|---|
| 0 | GradientBoosting | 0.97384 | 0.945 | 0.89 | 0.99 | 0.01 |
pred = xg.predict(X_val)
tn, fp, fn, tp = confusion_matrix(y_val, pred).ravel()
xg_score = xg.score(X_val, y_val)
xg_recall = round(tp/(tp+fn), 3) # Of all the people with Chd, how many were recognised to have Chd
xg_precision = round(tp/(tp+fp), 3) # Of all the people predicted to have Chd, how many did have Chd
xg_specificity =round(tn/(tn+fp),3)
result = pd.DataFrame({'Model' : ['XGBClassifier'], 'Accuracy' : [xg_score], 'Precision' : [xg_precision],
'True positive rate' : [xg_recall], 'True negative rate' : [xg_specificity],
'False positive rate' : [1-xg_specificity]})
result
| Model | Accuracy | Precision | True positive rate | True negative rate | False positive rate | |
|---|---|---|---|---|---|---|
| 0 | XGBClassifier | 0.74383 | 0.369 | 0.837 | 0.726 | 0.274 |
C. Mention which model is giving us better recall score. [1 Mark]
The Gradient Boosting model gives highest recall compared to all other models
D. Try target balancing with Up sampling the data. Build all the above models on this balanced data. Share your insights on the recall score of the models. [ 4 Marks]
y_train.value_counts()
0 5099 1 976 Name: Attrition_Flag, dtype: int64
train = pd.concat([X_train, y_train], axis = 1)
extra_samples = train[train['Attrition_Flag'] == 1].sample(n = 900, random_state = 1)
train_upsampled = pd.concat([train, extra_samples], axis = 0)
X_train = train_upsampled.drop(columns = 'Attrition_Flag')
y_train = train_upsampled.Attrition_Flag
X_train.shape
y_train.value_counts()
0 5099 1 1876 Name: Attrition_Flag, dtype: int64
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
dt_score = dt.score(X_test, y_test)
dt_recall = round(tp/(tp+fn), 3) # Of all the people with Chd, how many were recognised to have Chd
dt_precision = round(tp/(tp+fp), 3) # Of all the people predicted to have Chd, how many did have Chd
dt_specificity =round(tn/(tn+fp),3)
result = pd.DataFrame({'Model' : ['DecisionTree'], 'Accuracy' : [dt_score], 'Precision' : [dt_precision],
'True positive rate' : [dt_recall], 'True negative rate' : [dt_specificity],
'False positive rate' : [1-dt_specificity]})
result
| Model | Accuracy | Precision | True positive rate | True negative rate | False positive rate | |
|---|---|---|---|---|---|---|
| 0 | DecisionTree | 0.494256 | 0.771 | 0.818 | 0.954 | 0.046 |
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test,pred).ravel()
lr_score = lr.score(X_test, y_test)
lr_recall = round(tp/(tp+fn), 3) # Of all the people with Chd, how many were recognised to have Chd
lr_precision = round(tp/(tp+fp), 3) # Of all the people predicted to have Chd, how many did have Chd
lr_specificity =round(tn/(tn+fp),3) # Of all the people without Chds, how many were recognised to not have Chd
result = pd.DataFrame({'Model' : ['Logistic Regression'], 'Accuracy' : [lr_score], 'Precision' : [lr_precision],
'True positive rate' : [lr_recall], 'True negative rate' : [lr_specificity],
'False positive rate' : [1-lr_specificity]})
result
| Model | Accuracy | Precision | True positive rate | True negative rate | False positive rate | |
|---|---|---|---|---|---|---|
| 0 | Logistic Regression | 0.882034 | 0.626 | 0.658 | 0.925 | 0.075 |
rf.fit(X_train, y_train)
pred = rf.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
rf_score = rf.score(X_test, y_test)
rf_recall = round(tp/(tp+fn), 3) # Of all the people with Chd, how many were recognised to have Chd
rf_precision = round(tp/(tp+fp), 3) # Of all the people predicted to have Chd, how many did have Chd
rf_specificity =round(tn/(tn+fp),3)
result = pd.DataFrame({'Model' : ['RandomForest'], 'Accuracy' : [rf_score], 'Precision' : [rf_precision],
'True positive rate' : [rf_recall], 'True negative rate' : [rf_specificity],
'False positive rate' : [1-rf_specificity]})
result
| Model | Accuracy | Precision | True positive rate | True negative rate | False positive rate | |
|---|---|---|---|---|---|---|
| 0 | RandomForest | 0.962981 | 0.896 | 0.871 | 0.981 | 0.019 |
gb.fit(X_train, y_train)
pred = gb.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
gb_score = gb.score(X_test, y_test)
gb_recall = round(tp/(tp+fn), 3) # Of all the people with Chd, how many were recognised to have Chd
gb_precision = round(tp/(tp+fp), 3) # Of all the people predicted to have Chd, how many did have Chd
gb_specificity =round(tn/(tn+fp),3)
result = pd.DataFrame({'Model' : ['GradientBoosting'], 'Accuracy' : [gb_score], 'Precision' : [gb_precision],
'True positive rate' : [gb_recall], 'True negative rate' : [gb_specificity],
'False positive rate' : [1-gb_specificity]})
result
| Model | Accuracy | Precision | True positive rate | True negative rate | False positive rate | |
|---|---|---|---|---|---|---|
| 0 | GradientBoosting | 0.977295 | 0.921 | 0.938 | 0.985 | 0.015 |
xg.fit(X_train, y_train)
pred = xg.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
xg_score = xg.score(X_test, y_test)
xg_recall = round(tp/(tp+fn), 3) # Of all the people with Chd, how many were recognised to have Chd
xg_precision = round(tp/(tp+fp), 3) # Of all the people predicted to have Chd, how many did have Chd
xg_specificity =round(tn/(tn+fp),3)
result = pd.DataFrame({'Model' : ['XGBClassifier'], 'Accuracy' : [xg_score], 'Precision' : [xg_precision],
'True positive rate' : [xg_recall], 'True negative rate' : [xg_specificity],
'False positive rate' : [1-xg_specificity]})
result
| Model | Accuracy | Precision | True positive rate | True negative rate | False positive rate | |
|---|---|---|---|---|---|---|
| 0 | XGBClassifier | 0.7231 | 0.345 | 0.809 | 0.707 | 0.293 |
E. Try target balancing with Down sampling the data. Build all the above models on this balanced data. Share your insights on the recall score of the models. [ 4 Marks]
bc1.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CLIENTNUM 10127 non-null int64 1 Attrition_Flag 10127 non-null object 2 Customer_Age 10127 non-null int64 3 Gender 10127 non-null object 4 Dependent_count 10127 non-null int64 5 Education_Level 10127 non-null object 6 Marital_Status 10127 non-null object 7 Income_Category 10127 non-null object 8 Card_Category 10127 non-null object 9 Months_on_book 10127 non-null int64 10 Total_Relationship_Count 10127 non-null int64 11 Months_Inactive_12_mon 10127 non-null int64 12 Contacts_Count_12_mon 10127 non-null int64 13 Credit_Limit 10127 non-null float64 14 Total_Revolving_Bal 10127 non-null int64 15 Avg_Open_To_Buy 10127 non-null float64 16 Total_Amt_Chng_Q4_Q1 10127 non-null float64 17 Total_Trans_Amt 10127 non-null int64 18 Total_Trans_Ct 10127 non-null int64 19 Total_Ct_Chng_Q4_Q1 10127 non-null float64 20 Avg_Utilization_Ratio 10127 non-null float64 dtypes: float64(5), int64(10), object(6) memory usage: 1.6+ MB
d1ind = bc1[bc1['Attrition_Flag'] == 0].index
d1 = len(bc1[bc1['Attrition_Flag'] == 0])
print(d1)
d2ind = bc1[bc1['Attrition_Flag'] == 1].index
d2 = len(bc1[bc1['Attrition_Flag'] == 1])
print(d2)
8500 1627
random_indices = np.random.choice(d1ind, d1 - 6500 , replace=False) #Randomly pick up 200 non-diab indices
down_sample_indices = np.concatenate([d2ind,random_indices])
bc1_down_sample = bc1.loc[down_sample_indices] # Extract all those records for diab and non-diab to create new set
bc1_down_sample.shape
bc1_down_sample.groupby(["Attrition_Flag"]).count()
| Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Attrition_Flag | |||||||||||||||||||
| 0 | 2000 | 2000 | 2000 | 2000 | 2000 | 2000 | 2000 | 2000 | 2000 | 2000 | 2000 | 2000 | 2000 | 2000 | 2000 | 2000 | 2000 | 2000 | 2000 |
| 1 | 1627 | 1627 | 1627 | 1627 | 1627 | 1627 | 1627 | 1627 | 1627 | 1627 | 1627 | 1627 | 1627 | 1627 | 1627 | 1627 | 1627 | 1627 | 1627 |
X1 = bc1_down_sample.iloc[:,1 :]
y1= bc1_down_sample.iloc[:,0]
from sklearn.impute import SimpleImputer
ic_imputer = SimpleImputer(missing_values="Nan", strategy="most_frequent")
X1.Income_Category = ic_imputer.fit_transform(X1["Income_Category"].values.reshape(-1,1))[:,0]
X1.Income_Category.value_counts()
Less than $40K 1273 $40K - $60K 644 $80K - $120K 539 $60K - $80K 468 abc 414 $120K + 289 Name: Income_Category, dtype: int64
from sklearn.impute import SimpleImputer
ic_imputer = SimpleImputer(missing_values="Nan", strategy="most_frequent")
X1.Education_Level = ic_imputer.fit_transform(X1["Education_Level"].values.reshape(-1,1))[:,0]
X1.Education_Level.value_counts()
Graduate 1644 High School 715 Uneducated 533 College 345 Post-Graduate 206 Doctorate 184 Name: Education_Level, dtype: int64
from sklearn.impute import SimpleImputer
ic_imputer = SimpleImputer(missing_values="Nan", strategy="most_frequent")
X1.Marital_Status = ic_imputer.fit_transform(X1["Marital_Status"].values.reshape(-1,1))[:,0]
X1.Marital_Status.value_counts()
Married 1908 Single 1434 Divorced 285 Name: Marital_Status, dtype: int64
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
#entel = pd.DataFrame()
object_col = X1.columns[pd.Series(X1.columns).apply(lambda x : X1[x].dtype == 'O')]
for i in X1.columns:
if i in object_col:
X1[i] = le.fit_transform(X1[i])
else:
X1[i] = X1[i]
X_temp, X_test1, y_temp, y_test1 = train_test_split(X1, y1, test_size=0.2, random_state=1, stratify=y1)
X_train1, X_val1, y_train1, y_val1 = train_test_split(X_temp, y_temp, test_size=0.25, random_state=1, stratify=y_temp)
dt.fit(X_train1, y_train1)
y_pred = dt.predict(X_test1)
tn, fp, fn, tp = confusion_matrix(y_test1, y_pred).ravel()
dt_score = dt.score(X_test1, y_test1)
dt_recall = round(tp/(tp+fn), 3) # Of all the people with Chd, how many were recognised to have Chd
dt_precision = round(tp/(tp+fp), 3) # Of all the people predicted to have Chd, how many did have Chd
dt_specificity =round(tn/(tn+fp),3)
result = pd.DataFrame({'Model' : ['DecisionTree'], 'Accuracy' : [dt_score], 'Precision' : [dt_precision],
'True positive rate' : [dt_recall], 'True negative rate' : [dt_specificity],
'False positive rate' : [1-dt_specificity]})
result
| Model | Accuracy | Precision | True positive rate | True negative rate | False positive rate | |
|---|---|---|---|---|---|---|
| 0 | DecisionTree | 0.571304 | 0.874 | 0.893 | 0.895 | 0.105 |
lr.fit(X_train1, y_train1)
pred = lr.predict(X_test1)
tn, fp, fn, tp = confusion_matrix(y_test1,pred).ravel()
lr_score = lr.score(X_test1, y_test1)
lr_recall = round(tp/(tp+fn), 3) # Of all the people with Chd, how many were recognised to have Chd
lr_precision = round(tp/(tp+fp), 3) # Of all the people predicted to have Chd, how many did have Chd
lr_specificity =round(tn/(tn+fp),3) # Of all the people without Chds, how many were recognised to not have Chd
result = pd.DataFrame({'Model' : ['Logistic Regression'], 'Accuracy' : [lr_score], 'Precision' : [lr_precision],
'True positive rate' : [lr_recall], 'True negative rate' : [lr_specificity],
'False positive rate' : [1-lr_specificity]})
result
| Model | Accuracy | Precision | True positive rate | True negative rate | False positive rate | |
|---|---|---|---|---|---|---|
| 0 | Logistic Regression | 0.834711 | 0.824 | 0.804 | 0.86 | 0.14 |
rf.fit(X_train1, y_train1)
pred = rf.predict(X_test1)
tn, fp, fn, tp = confusion_matrix(y_test1, pred).ravel()
rf_score = rf.score(X_test1, y_test1)
rf_recall = round(tp/(tp+fn), 3) # Of all the people with Chd, how many were recognised to have Chd
rf_precision = round(tp/(tp+fp), 3) # Of all the people predicted to have Chd, how many did have Chd
rf_specificity =round(tn/(tn+fp),3)
result = pd.DataFrame({'Model' : ['RandomForest'], 'Accuracy' : [rf_score], 'Precision' : [rf_precision],
'True positive rate' : [rf_recall], 'True negative rate' : [rf_specificity],
'False positive rate' : [1-rf_specificity]})
result
| Model | Accuracy | Precision | True positive rate | True negative rate | False positive rate | |
|---|---|---|---|---|---|---|
| 0 | RandomForest | 0.92562 | 0.91 | 0.926 | 0.925 | 0.075 |
gb.fit(X_train1, y_train1)
pred = gb.predict(X_test1)
tn, fp, fn, tp = confusion_matrix(y_test1, pred).ravel()
gb_score = gb.score(X_test1, y_test1)
gb_recall = round(tp/(tp+fn), 3) # Of all the people with Chd, how many were recognised to have Chd
gb_precision = round(tp/(tp+fp), 3) # Of all the people predicted to have Chd, how many did have Chd
gb_specificity =round(tn/(tn+fp),3)
result = pd.DataFrame({'Model' : ['GradientBoosting'], 'Accuracy' : [gb_score], 'Precision' : [gb_precision],
'True positive rate' : [gb_recall], 'True negative rate' : [gb_specificity],
'False positive rate' : [1-gb_specificity]})
result
| Model | Accuracy | Precision | True positive rate | True negative rate | False positive rate | |
|---|---|---|---|---|---|---|
| 0 | GradientBoosting | 0.940771 | 0.927 | 0.942 | 0.94 | 0.06 |
xg.fit(X_train1, y_train1)
pred = xg.predict(X_test1)
tn, fp, fn, tp = confusion_matrix(y_test1, pred).ravel()
xg_score = xg.score(X_test1, y_test1)
xg_recall = round(tp/(tp+fn), 3) # Of all the people with Chd, how many were recognised to have Chd
xg_precision = round(tp/(tp+fp), 3) # Of all the people predicted to have Chd, how many did have Chd
xg_specificity =round(tn/(tn+fp),3)
result = pd.DataFrame({'Model' : ['XGBClassifier'], 'Accuracy' : [xg_score], 'Precision' : [xg_precision],
'True positive rate' : [xg_recall], 'True negative rate' : [xg_specificity],
'False positive rate' : [1-xg_specificity]})
result
| Model | Accuracy | Precision | True positive rate | True negative rate | False positive rate | |
|---|---|---|---|---|---|---|
| 0 | XGBClassifier | 0.760331 | 0.68 | 0.88 | 0.662 | 0.338 |
F. Which data has better performing models, its Original data or Up sampled data or Down Sampled data? Share your insights. [1 Mark]
The recall or True positive rate for Down sampled data
XGBClassifier - 0.896
GradientBoosting - 0.954
RandomForest - 0.939
Logistic Regression - 0.794
DecisionTree - 0.905
The recall or True positive rate for Up sampled data
XGBClassifier - 0.809
GradientBoosting - 0.938
RandomForest - 0.886
Logistic Regression - 0.64
DecisionTree - 0.837
The recall or True positive rate for Original data
XGBClassifier - 0.837
GradientBoosting - 0.89
RandomForest - 0.847
Logistic Regression - 0.552
DecisionTree - 0.813
Interms of recall, Down sampling of data provided best results
G. From the above built models, which 2 models are outperforming with better recall score. Mention and share your insights
Comparing above five models, 1) GradientBoosting and 2) RandomForest algorithms outperform other 3 algorithms for this data.
A. Choose any 2 best performing models from above and tune the hyper parameters of those models. [4 Marks]
rf=RandomForestClassifier( bootstrap = False,
max_depth = 20,
max_features = 'auto' ,
min_samples_leaf = 1,
min_samples_split = 2,
n_estimators = 200, random_state= 1)
The hyperparameters considered for random forest classifier are max_depth, min_samples_split, n_estimators
gb=GradientBoostingClassifier(n_estimators=500,learning_rate=0.05,random_state=100,max_features=5 )
The hyperparameters considered for gradient boosting classifier are n_estimators, learning_rate,max_features
B. Build the models on train data. [4 Marks]
C. Now validate the models on test data. Check with all performance matrix. Share your insights. [4 Marks]
gb.fit(X_train1, y_train1)
pred = gb.predict(X_test1)
tn, fp, fn, tp = confusion_matrix(y_test1, pred).ravel()
gb_score = gb.score(X_test1, y_test1)
gb_recall = round(tp/(tp+fn), 3) # Of all the people with Chd, how many were recognised to have Chd
gb_precision = round(tp/(tp+fp), 3) # Of all the people predicted to have Chd, how many did have Chd
gb_specificity =round(tn/(tn+fp),3)
result = pd.DataFrame({'Model' : ['GradientBoosting'], 'Accuracy' : [gb_score], 'Precision' : [gb_precision],
'True positive rate' : [gb_recall], 'True negative rate' : [gb_specificity],
'False positive rate' : [1-gb_specificity]})
result
| Model | Accuracy | Precision | True positive rate | True negative rate | False positive rate | |
|---|---|---|---|---|---|---|
| 0 | GradientBoosting | 0.940771 | 0.927 | 0.942 | 0.94 | 0.06 |
gb=GradientBoostingClassifier(n_estimators=850,learning_rate=0.2,random_state=100,max_features=7 )
gb.fit(X_train1, y_train1)
pred = gb.predict(X_test1)
tn, fp, fn, tp = confusion_matrix(y_test1, pred).ravel()
gb_score = gb.score(X_test1, y_test1)
gb_recall = round(tp/(tp+fn), 3) # Of all the people with Chd, how many were recognised to have Chd
gb_precision = round(tp/(tp+fp), 3) # Of all the people predicted to have Chd, how many did have Chd
gb_specificity =round(tn/(tn+fp),3)
result = pd.DataFrame({'Model' : ['GradientBoosting'], 'Accuracy' : [gb_score], 'Precision' : [gb_precision],
'True positive rate' : [gb_recall], 'True negative rate' : [gb_specificity],
'False positive rate' : [1-gb_specificity]})
result
| Model | Accuracy | Precision | True positive rate | True negative rate | False positive rate | |
|---|---|---|---|---|---|---|
| 0 | GradientBoosting | 0.949036 | 0.934 | 0.954 | 0.945 | 0.055 |
rf.fit(X_train1, y_train1)
pred = rf.predict(X_test1)
tn, fp, fn, tp = confusion_matrix(y_test1, pred).ravel()
rf_score = rf.score(X_test1, y_test1)
rf_recall = round(tp/(tp+fn), 3) # Of all the people with Chd, how many were recognised to have Chd
rf_precision = round(tp/(tp+fp), 3) # Of all the people predicted to have Chd, how many did have Chd
rf_specificity =round(tn/(tn+fp),3)
result = pd.DataFrame({'Model' : ['RandomForest'], 'Accuracy' : [rf_score], 'Precision' : [rf_precision],
'True positive rate' : [rf_recall], 'True negative rate' : [rf_specificity],
'False positive rate' : [1-rf_specificity]})
result
C:\Users\SUBASHINI\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py:424: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers. warn(
| Model | Accuracy | Precision | True positive rate | True negative rate | False positive rate | |
|---|---|---|---|---|---|---|
| 0 | RandomForest | 0.932507 | 0.918 | 0.933 | 0.932 | 0.068 |
rf=RandomForestClassifier( bootstrap = False,
max_depth = 40,
max_features = 'auto' ,
min_samples_leaf = 1,
min_samples_split = 3,
n_estimators =250, random_state= 1)
rf.fit(X_train1, y_train1)
pred = rf.predict(X_test1)
tn, fp, fn, tp = confusion_matrix(y_test1, pred).ravel()
rf_score = rf.score(X_test1, y_test1)
rf_recall = round(tp/(tp+fn), 3) # Of all the people with Chd, how many were recognised to have Chd
rf_precision = round(tp/(tp+fp), 3) # Of all the people predicted to have Chd, how many did have Chd
rf_specificity =round(tn/(tn+fp),3)
result = pd.DataFrame({'Model' : ['RandomForest'], 'Accuracy' : [rf_score], 'Precision' : [rf_precision],
'True positive rate' : [rf_recall], 'True negative rate' : [rf_specificity],
'False positive rate' : [1-rf_specificity]})
result
C:\Users\SUBASHINI\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py:424: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers. warn(
| Model | Accuracy | Precision | True positive rate | True negative rate | False positive rate | |
|---|---|---|---|---|---|---|
| 0 | RandomForest | 0.932507 | 0.924 | 0.926 | 0.938 | 0.062 |
A. Pickle the selected model for future use. [2 Marks]
def gbfit_tune():
gb=GradientBoostingClassifier(n_estimators=500,learning_rate=0.05,random_state=100,max_features=5 )
gb.fit(X_train1, y_train1)
pred = gb.predict(X_test1)
tn, fp, fn, tp = confusion_matrix(y_test1, pred).ravel()
gb_score = gb.score(X_test1, y_test1)
gb_recall = round(tp/(tp+fn), 3) # Of all the people with Chd, how many were recognised to have Chd
gb_precision = round(tp/(tp+fp), 3) # Of all the people predicted to have Chd, how many did have Chd
gb_specificity =round(tn/(tn+fp),3)
result = pd.DataFrame({'Model' : ['GradientBoosting'], 'Accuracy' : [gb_score], 'Precision' : [gb_precision],
'True positive rate' : [gb_recall], 'True negative rate' : [gb_specificity],
'False positive rate' : [1-gb_specificity]})
result
def rffit_tune():
rf.fit(X_train1, y_train1)
pred = rf.predict(X_test1)
tn, fp, fn, tp = confusion_matrix(y_test1, pred).ravel()
rf_score = rf.score(X_test1, y_test1)
rf_recall = round(tp/(tp+fn), 3) # Of all the people with Chd, how many were recognised to have Chd
rf_precision = round(tp/(tp+fp), 3) # Of all the people predicted to have Chd, how many did have Chd
rf_specificity =round(tn/(tn+fp),3)
result = pd.DataFrame({'Model' : ['RandomForest'], 'Accuracy' : [rf_score], 'Precision' : [rf_precision],
'True positive rate' : [rf_recall], 'True negative rate' : [rf_specificity],
'False positive rate' : [1-rf_specificity]})
result
def main():
gbfit_tune()
rffit_tune()
import dill as pickle
g = pickle.dumps(main)
pickle.loads(g)()
C:\Users\SUBASHINI\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py:424: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers. warn(
B. Articulate some Business Recommendations and observation from the model. [2 Marks]
Having considered various evaluation metrics , we can conclude that the gradient boosting classifier is the best performing algorithm for our data amongst the other explored models to study and predict customer churn behaviour.
This model can help the bank predict which customers might consider leaving. With such information, the bank can proactively prevent the customer from leaving by addressing the issues the customers might be facing that incentivized them to consider leaving.